import csv
import random
import numpy as np
from openpyxl import Workbook

# Run configuration: file locations and sampling parameters.
# All three paths are relative, i.e. resolved against the working directory.
# Input CSV: the first cell of each row is read as that row's label and the
# remaining cells as the row's values.
input_csv = 'last_run_lineage_arraysinh50div.01ratefaster_good_46cpg_rand20.csv'
# Excel workbook that receives the pairwise distance matrix of the final run only.
output_xlsx = 'output_matrix.xlsx'
# CSV that receives, for every run, the mean distance followed by each sampled
# row's minimum distance.
average_csv = 'ave.csv'
N = 273  # Number of rows sampled (without replacement) in each run
R = 1000  # Number of sampling runs to perform

# Function to calculate pairwise distances
def calculate_pairwise_distance(row1, row2):
    """Return the fraction of positions at which two rows differ.

    Cells are paired positionally and compared with ``!=``. Pairing stops at
    the end of the shorter row, and the mismatch count is always divided by
    the length of ``row1``.

    Args:
        row1: First sequence of cells; its length is the denominator.
        row2: Second sequence of cells.

    Returns:
        The number of differing paired cells divided by ``len(row1)``.

    Raises:
        ZeroDivisionError: If ``row1`` is empty.
    """
    mismatches = 0
    for left, right in zip(row1, row2):
        if left != right:
            mismatches += 1
    return mismatches / len(row1)

# Accumulated per-run results. process_run() appends one list of strings per
# run: the mean pairwise distance first, then each sampled row's minimum
# distance, all formatted to four decimal places.
run_results = []

# Function to process a single run
def process_run(run_number):
    """Sample N rows, compute their pairwise distances and record a summary.

    The summary appended to the module-level ``run_results`` list is a list of
    strings: the mean of all off-diagonal distances followed by the minimum
    off-diagonal distance of each sampled row, each formatted with ``.4f``.
    When ``run_number`` equals ``R`` the full distance matrix is also written
    to the Excel file named by ``output_xlsx``.

    Args:
        run_number: 1-based number of the current run.

    Raises:
        ValueError: If the loaded data holds fewer than ``N`` rows.
    """
    total_rows = len(data)
    if total_rows < N:
        raise ValueError("Not enough rows in the CSV to select N rows.")

    # Draw N distinct row positions and pull out the matching labels and rows.
    picked = random.sample(range(total_rows), N)
    selected_headers = [headers[k] for k in picked]
    selected_rows = [data[k] for k in picked]

    # The diagonal is left as NaN so that self-comparisons are excluded from
    # the mean and minimum computed below.
    distances = np.full((N, N), np.nan)
    for r, left in enumerate(selected_rows):
        for c, right in enumerate(selected_rows):
            if r == c:
                continue
            distances[r, c] = calculate_pairwise_distance(left, right)

    # Only the final run exports its matrix to the workbook.
    if run_number == R:
        workbook = Workbook()
        sheet = workbook.active

        # Column labels, with a blank corner cell above the row labels.
        sheet.append([''] + selected_headers)

        # One labelled line per sampled row; values are written as text.
        for label, matrix_row in zip(selected_headers, distances):
            cells = [label]
            for value in matrix_row:
                if np.isnan(value):
                    cells.append("NaN")
                else:
                    cells.append(f"{value:.4g}")
            sheet.append(cells)

        workbook.save(output_xlsx)
        print(f"Pairwise distance matrix for run {run_number} has been written to {output_xlsx}.")

    # Mean over every finite entry, and the per-row minimum ignoring NaN.
    finite_mask = np.isfinite(distances)
    average_difference = np.mean(distances[finite_mask])
    row_minima = np.nanmin(distances, axis=1)

    summary = [f"{average_difference:.4f}"]
    summary.extend(f"{lowest:.4f}" for lowest in row_minima)
    run_results.append(summary)

# Step 1: Read the input CSV file.
# The first cell of every row is kept as that row's label (headers) and the
# remaining cells become the row's data. Every non-blank line is treated as a
# data row; no line is handled specially as a column-header line.
with open(input_csv, mode='r', newline='', encoding='utf-8') as infile:
    reader = csv.reader(infile)
    headers = []
    data = []

    for row in reader:
        # csv.reader yields an empty list for a blank line; indexing row[0]
        # on it would raise IndexError, so such lines are skipped.
        if not row:
            continue
        headers.append(row[0])
        data.append(row[1:])

# Perform the runs. Run numbers are 1-based so that the final run's number
# equals R, which is the condition process_run() checks before writing the
# Excel matrix.
for run_number in range(1, R + 1):
    process_run(run_number)

# Write the accumulated results to ave.csv.
# Layout: one column per run; the first data line holds each run's mean
# distance and the following lines hold the per-row minimum distances.
with open(average_csv, mode='w', newline='', encoding='utf-8') as outfile:
    writer = csv.writer(outfile)

    # Write the header row: one label per recorded run. The data lines carry
    # no leading label cell, so the header must not start with a blank cell
    # either; otherwise every "Run k" label sits one column to the right of
    # the values it names.
    header_row = [f'Run {i}' for i in range(1, len(run_results) + 1)]
    writer.writerow(header_row)

    # Find the maximum number of rows needed (0 when no run was recorded).
    max_rows = max((len(results) for results in run_results), default=0)

    # Write each row of results, padding shorter runs with empty cells.
    for i in range(max_rows):
        row = [results[i] if i < len(results) else '' for results in run_results]
        writer.writerow(row)

print(f"Average and minimum values have been written to {average_csv}.")

